{
 "cells": [
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": [
    "# 各种分布\n",
    "1. 字数分布：字数区间占比环形图（如50万以下/50-100万/100万以上）\n",
    "2. 类型分布：小说类型占比饼图\n",
    "3. 分级统计：各类型平均字数对比水平条形图\n",
    "4. 评分分布：绘制正态分布曲线"
   ],
   "id": "bd97b8b4af36a696"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-26T10:41:26.433819Z",
     "start_time": "2025-02-26T10:41:26.052714Z"
    }
   },
   "cell_type": "code",
   "source": [
    "from pymongo import MongoClient\n",
    "# 连接到MongoDB\n",
    "client = MongoClient('mongodb://localhost:27017/')\n",
    "# 打开learn数据库\n",
    "db = client['learn']\n",
    "# 在数据库中打开seven_cats集合\n",
    "books = db['seven_cats']"
   ],
   "id": "fd34b6a45c60581f",
   "outputs": [],
   "execution_count": 1
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-26T10:41:28.240334Z",
     "start_time": "2025-02-26T10:41:27.441285Z"
    }
   },
   "cell_type": "code",
   "source": [
    "import pandas as pd\n",
    "# 将MongoDB数据转换为pandas数据\n",
    "add = pd.DataFrame(list(books.find({}, {'_id': 0, 'wordcount': 1, 'type': 1, 'serialised': 1, 'score': 1})))\n",
    "add['count'] = 1\n",
    "add"
   ],
   "id": "8a02b131266fa440",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "      score serialised  type  wordcount  count\n",
       "0       9.3        连载中    都市     272.47      1\n",
       "1       9.1        连载中  武侠仙侠      38.60      1\n",
       "2       8.8        连载中  玄幻奇幻      61.96      1\n",
       "3       9.3        连载中  玄幻奇幻     664.36      1\n",
       "4       9.1        连载中  玄幻奇幻     552.87      1\n",
       "...     ...        ...   ...        ...    ...\n",
       "7713    9.3         完结    科幻     329.66      1\n",
       "7714    8.9         完结    科幻     424.60      1\n",
       "7715    9.1        连载中    都市     135.02      1\n",
       "7716    9.0         完结   N次元      82.74      1\n",
       "7717    9.0         完结   N次元      85.03      1\n",
       "\n",
       "[7718 rows x 5 columns]"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>score</th>\n",
       "      <th>serialised</th>\n",
       "      <th>type</th>\n",
       "      <th>wordcount</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>9.3</td>\n",
       "      <td>连载中</td>\n",
       "      <td>都市</td>\n",
       "      <td>272.47</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>9.1</td>\n",
       "      <td>连载中</td>\n",
       "      <td>武侠仙侠</td>\n",
       "      <td>38.60</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8.8</td>\n",
       "      <td>连载中</td>\n",
       "      <td>玄幻奇幻</td>\n",
       "      <td>61.96</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>9.3</td>\n",
       "      <td>连载中</td>\n",
       "      <td>玄幻奇幻</td>\n",
       "      <td>664.36</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>9.1</td>\n",
       "      <td>连载中</td>\n",
       "      <td>玄幻奇幻</td>\n",
       "      <td>552.87</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7713</th>\n",
       "      <td>9.3</td>\n",
       "      <td>完结</td>\n",
       "      <td>科幻</td>\n",
       "      <td>329.66</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7714</th>\n",
       "      <td>8.9</td>\n",
       "      <td>完结</td>\n",
       "      <td>科幻</td>\n",
       "      <td>424.60</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7715</th>\n",
       "      <td>9.1</td>\n",
       "      <td>连载中</td>\n",
       "      <td>都市</td>\n",
       "      <td>135.02</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7716</th>\n",
       "      <td>9.0</td>\n",
       "      <td>完结</td>\n",
       "      <td>N次元</td>\n",
       "      <td>82.74</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7717</th>\n",
       "      <td>9.0</td>\n",
       "      <td>完结</td>\n",
       "      <td>N次元</td>\n",
       "      <td>85.03</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>7718 rows × 5 columns</p>\n",
       "</div>"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 2
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-26T10:41:28.535174Z",
     "start_time": "2025-02-26T10:41:28.425602Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# 导入可视化工具\n",
    "from pyecharts import options as opts"
   ],
   "id": "efc970550a0e276a",
   "outputs": [],
   "execution_count": 3
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": [
    "## 一、字数分布：字数区间占比环形图\n",
    "（10万以下/10-30万/30-50万/50-100万/100-200万/200-300万/300-500万/500万以上）"
   ],
   "id": "a7ff5d72b57b585b"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-26T00:38:48.275696Z",
     "start_time": "2025-02-26T00:38:48.267318Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# 分箱分组\n",
    "agg = pd.concat([add['wordcount'], add['count']], axis=1)\n",
    "bins = [-float('inf'), 10, 30, 50, 100, 200, 300, 500, float('inf')]\n",
    "labels = ['10万以下', '10-30万', '30-50万', '50-100万', '100-200万', '200-300万', '300-500万', '500万以上']\n",
    "agg['word_interval'] = pd.cut(agg['wordcount'], bins=bins, labels=labels)\n",
    "grouped = agg.groupby('word_interval', observed=False)['count'].sum()\n",
    "grouped"
   ],
   "id": "3de69c43c930576f",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "word_interval\n",
       "10万以下        370\n",
       "10-30万      1551\n",
       "30-50万       938\n",
       "50-100万     1277\n",
       "100-200万    1873\n",
       "200-300万     706\n",
       "300-500万     593\n",
       "500万以上       410\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 4
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-26T00:38:49.290755Z",
     "start_time": "2025-02-26T00:38:49.258070Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# 构建环形图\n",
    "from pyecharts.charts import Pie\n",
    "pie = Pie()\n",
    "pie.add(\n",
    "        \"\",\n",
    "        [list(z) for z in zip(grouped.index, grouped)],\n",
    "        radius=[\"40%\", \"70%\"],\n",
    ")\n",
    "pie.set_global_opts(\n",
    "        title_opts=opts.TitleOpts(title=\"七猫男生原创小说字数分布环形图\", pos_left=\"center\", pos_top=\"1%\"), \n",
    "        legend_opts=opts.LegendOpts(orient=\"vertical\", pos_top=\"20%\", pos_left=\"5%\", item_gap=20),\n",
    ")\n",
    "pie.set_series_opts(label_opts=opts.LabelOpts(color=\"auto\", font_size=14))\n",
    "pie.render(\"D:\\sevencats-novels-analysis\\images\\七猫男生原创小说字数分布.html\")"
   ],
   "id": "556029401a458180",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'D:\\\\sevencats-novels-analysis\\\\images\\\\七猫男生原创小说字数分布.html'"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 5
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "## 二、类型分布：小说类型占比饼图",
   "id": "2e21d0294a0ee857"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-26T00:38:50.898982Z",
     "start_time": "2025-02-26T00:38:50.890149Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# 分组聚合\n",
    "agg = pd.concat([add['type'], add['count']], axis=1)\n",
    "grouped = agg.groupby('type', observed=False)['count'].sum()\n",
    "grouped"
   ],
   "id": "c0b8cdd134bd9bb2",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "type\n",
       "N次元      438\n",
       "体育        75\n",
       "军事        37\n",
       "历史      1043\n",
       "奇闻异事     237\n",
       "武侠仙侠     500\n",
       "游戏       135\n",
       "玄幻奇幻    1812\n",
       "现实题材      23\n",
       "科幻       336\n",
       "都市      3082\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 6
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-26T00:38:51.721543Z",
     "start_time": "2025-02-26T00:38:51.711418Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# 构建饼图\n",
    "from pyecharts.charts import Pie\n",
    "pie = Pie()\n",
    "pie.add(\n",
    "        \"\",\n",
    "        [list(z) for z in zip(grouped.index, grouped)],\n",
    ")\n",
    "pie.set_global_opts(\n",
    "        title_opts=opts.TitleOpts(title=\"七猫男生原创小说类型分布饼图\", pos_left=\"center\", pos_top=\"1%\"), \n",
    "        legend_opts=opts.LegendOpts(orient=\"vertical\", pos_top=\"14%\", pos_left=\"10%\", item_gap=20),\n",
    ")\n",
    "pie.set_series_opts(label_opts=opts.LabelOpts(color=\"auto\", font_size=14))\n",
    "pie.render(\"D:\\sevencats-novels-analysis\\images\\七猫男生原创小说类型分布.html\")"
   ],
   "id": "d50291b6e48dcaf1",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'D:\\\\sevencats-novels-analysis\\\\images\\\\七猫男生原创小说类型分布.html'"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 7
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "# 三、分级统计：各类型平均字数对比水平条形图",
   "id": "6832994b6e0b88d3"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-26T00:38:52.824625Z",
     "start_time": "2025-02-26T00:38:52.816995Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# 分组聚合\n",
    "agg = pd.concat([add['type'], add['wordcount']], axis=1)\n",
    "grouped = agg.groupby('type', observed=False)['wordcount'].mean()\n",
    "sorting = grouped.astype(int).sort_values()\n",
    "sorting"
   ],
   "id": "3dcf74c8a31da7ef",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "type\n",
       "现实题材     38\n",
       "历史      106\n",
       "奇闻异事    115\n",
       "都市      134\n",
       "N次元     137\n",
       "体育      143\n",
       "科幻      152\n",
       "游戏      171\n",
       "军事      186\n",
       "玄幻奇幻    192\n",
       "武侠仙侠    196\n",
       "Name: wordcount, dtype: int64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 8
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-26T00:38:53.410994Z",
     "start_time": "2025-02-26T00:38:53.403358Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# 构建水平条形图\n",
    "from pyecharts.charts import Bar\n",
    "from pyecharts.commons.utils import JsCode\n",
    "\n",
    "# 查阅各种资料进行修改所得\n",
    "# 创建渐变色函数（蓝紫渐变）\n",
    "color_js = JsCode('''(params) => {\n",
    "    return new echarts.graphic.LinearGradient(1, 0, 0, 0, \n",
    "    [{offset: 0, color: '#5470c6'}, {offset: 1, color: '#9a60b4'}])\n",
    "}''')\n",
    "\n",
    "bar = Bar(init_opts=opts.InitOpts(\n",
    "        bg_color=\"#f5f7fa\",  # 浅灰背景\n",
    "        animation_opts=opts.AnimationOpts(\n",
    "                animation_delay=500,  # 动画延迟\n",
    "                animation_easing=\"cubicOut\"  # 弹性动画\n",
    "        )\n",
    "))\n",
    "bar.add_xaxis(sorting.index.tolist())\n",
    "bar.add_yaxis(\n",
    "        '', sorting.tolist(), category_gap=\"15%\", \n",
    "        label_opts=opts.LabelOpts(\n",
    "                is_show=True, \n",
    "                position='right',\n",
    "                font_size=12, \n",
    "                font_family=\"Arial\", \n",
    "                color=\"#333\",\n",
    "        ), \n",
    "        itemstyle_opts=opts.ItemStyleOpts(\n",
    "                color=color_js,  # 渐变色\n",
    "                border_width=1.5,\n",
    "                border_color=\"#fff\",\n",
    "                opacity=0.9,\n",
    "        )\n",
    ")\n",
    "bar.reversal_axis()\n",
    "bar.set_global_opts(\n",
    "        title_opts=opts.TitleOpts(title=\"各类型平均字数对比水平条形图\", pos_left=\"center\", pos_top=\"3%\",),\n",
    "        xaxis_opts=opts.AxisOpts(\n",
    "                name=\"平均字数（单位：万字）\", \n",
    "                name_location=\"center\", \n",
    "                name_gap=30, \n",
    "                axislabel_opts=opts.LabelOpts(\n",
    "                        font_size=12,\n",
    "                )\n",
    "        ),\n",
    ")\n",
    "bar.render(\"D:\\sevencats-novels-analysis\\images\\七猫各类型平均字数对比.html\")"
   ],
   "id": "872a5bb1d378650b",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'D:\\\\sevencats-novels-analysis\\\\images\\\\七猫各类型平均字数对比.html'"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 9
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "# 四、评分分布：绘制正态分布曲线",
   "id": "4d24778d3f0312d5"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-26T10:41:34.673087Z",
     "start_time": "2025-02-26T10:41:34.660049Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# 分组聚合\n",
    "agg = pd.concat([add['score'], add['count']], axis=1)\n",
    "grouped = agg.groupby('score', observed=False)['count'].sum()\n",
    "grouped"
   ],
   "id": "1cc4900da4cf6b30",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "score\n",
       "6.1       1\n",
       "6.7       1\n",
       "7.0     401\n",
       "7.5       1\n",
       "7.7       1\n",
       "7.8       1\n",
       "8.0    1297\n",
       "8.1       8\n",
       "8.2       7\n",
       "8.3       9\n",
       "8.4     708\n",
       "8.5      69\n",
       "8.6     291\n",
       "8.7     287\n",
       "8.8     490\n",
       "8.9     634\n",
       "9.0     873\n",
       "9.1     739\n",
       "9.2     968\n",
       "9.3     432\n",
       "9.4     212\n",
       "9.5     148\n",
       "9.6     100\n",
       "9.7      30\n",
       "9.8      10\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 4
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-26T10:41:38.696723Z",
     "start_time": "2025-02-26T10:41:37.975236Z"
    }
   },
   "cell_type": "code",
   "source": [
    "from pyecharts.charts import Scatter, Line\n",
    "from scipy.optimize import curve_fit\n",
    "import numpy as np\n",
    "\n",
    "# 构建散点图\n",
    "scatter = Scatter()\n",
    "scatter.add_xaxis(grouped.index.tolist())\n",
    "scatter.add_yaxis('作品数量', grouped.tolist(), symbol_size=10, label_opts=opts.LabelOpts(is_show=False))\n",
    "\n",
    "# 询问AI进行的高斯拟合\n",
    "# 构建拟合曲线\n",
    "ratings = grouped.index.to_numpy()\n",
    "works = grouped.values\n",
    "# --------------------- 高斯拟合 ---------------------\n",
    "def gaussian(x, a, mu, sigma):\n",
    "    \"\"\"高斯函数公式\n",
    "    a: 振幅（峰值高度）\n",
    "    mu: 均值（中心位置）\n",
    "    sigma: 标准差（分布宽度）\n",
    "    \"\"\"\n",
    "    return a * np.exp(-(x - mu)**2 / (2 * sigma**2))\n",
    "\n",
    "# 执行拟合（p0为初始猜测参数：振幅、均值、标准差）\n",
    "popt, pcov = curve_fit(gaussian, ratings, works, p0=[1300, 8, 0.5])\n",
    "fit_curve = gaussian(ratings, *popt)  # 生成拟合曲线数据\n",
    "\n",
    "line = Line()\n",
    "line.add_xaxis(ratings.tolist())\n",
    "line.add_yaxis(\n",
    "        \"高斯拟合\",\n",
    "        fit_curve.tolist(),\n",
    "        is_smooth=True,\n",
    "        is_symbol_show=False,\n",
    "        linestyle_opts=opts.LineStyleOpts(width=3, color=\"red\"), \n",
    "        label_opts=opts.LabelOpts(is_show=False)\n",
    ")\n",
    "\n",
    "# 组合图表\n",
    "scatter.overlap(line)\n",
    "scatter.set_global_opts(\n",
    "        title_opts=opts.TitleOpts(title=\"评分分布曲线\", pos_left=\"center\", pos_bottom=\"1%\"),\n",
    "        xaxis_opts=opts.AxisOpts(\n",
    "                name=\"评分\", type_=\"value\", splitline_opts=opts.SplitLineOpts(is_show=True), min_=6\n",
    "        ),\n",
    "        yaxis_opts=opts.AxisOpts(\n",
    "                name=\"作品数量\", \n",
    "                type_=\"value\",\n",
    "                axistick_opts=opts.AxisTickOpts(is_show=True),\n",
    "                splitline_opts=opts.SplitLineOpts(is_show=True),\n",
    "        ),\n",
    ")\n",
    "scatter.render('D:\\sevencats-novels-analysis\\images\\七猫评分分布曲线.html')"
   ],
   "id": "8287a56d9a9cd0c3",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'D:\\\\sevencats-novels-analysis\\\\images\\\\七猫评分分布曲线.html'"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 5
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
